# Setup ----
# NOTE(review): removed the original `rm(list = ls())`. It only clears the
# global environment (not packages, options, or devices) and destroys a
# collaborator's workspace when the script is sourced; restart the R session
# instead for a clean state.
library(tidyverse)
library(readxl)
library(janitor)
library(skimr)
Data load: set the data path and list the names of all sheets in the Excel workbook.
# Path to the interview dataset workbook (relative to the script's directory).
# FIX: use `<-` for assignment rather than `=`, per the tidyverse style guide.
data_path <- "../data/ai_interview_dataset_1129.xlsx"
sheets <- excel_sheets(data_path)  # Character vector of worksheet names
print("Sheets in the dataset:")
## [1] "Sheets in the dataset:"
sheets # Auto-print the character vector of worksheet names
## [1] "users" "sessions" "technical_questions"
## [4] "behavioral_questions" "system_events" "timeline"
Read each sheet from the excel file:
# Read every worksheet into a named list of tibbles, one element per sheet.
data_list <- lapply(sheets, function(sheet_name) {
  read_excel(data_path, sheet = sheet_name)
})
names(data_list) <- sheets
Dataset Overview: summarises the basic structure of each dataset, including the number of rows, columns, variable names, and missing rate.
# Overview of each sheet: row count, column count, column names, and the
# overall missing rate (share of NA cells among all cells).
overview <- list()
for (sheet in sheets) {
  df <- data_list[[sheet]]
  n_cells <- nrow(df) * ncol(df)
  overview[[sheet]] <- tibble(
    rows = nrow(df),
    cols = ncol(df),
    col_names = paste(names(df), collapse = ", "),
    # FIX: guard against division by zero — an empty sheet (0 rows or
    # 0 cols) previously produced NaN; report NA instead.
    missing_rate = if (n_cells > 0) sum(is.na(df)) / n_cells else NA_real_
  )
}
print("Sheet overview:")
## [1] "Sheet overview:"
overview # Auto-print the per-sheet overview tibbles
$users # A tibble: 1 × 4 rows cols col_names missing_rate
$sessions # A tibble: 1 × 4 rows cols col_names missing_rate
$technical_questions # A tibble: 1 × 4 rows cols col_names
missing_rate
$behavioral_questions # A tibble: 1 × 4 rows cols col_names
missing_rate
$system_events # A tibble: 1 × 4 rows cols col_names missing_rate
$timeline # A tibble: 1 × 4 rows cols col_names missing_rate
Numeric Variable Summary: select only numeric variables and analyse numeric variables using descriptive statistics.
# Numeric variable summary: per-sheet descriptive statistics via skimr.
summary_list <- list()
for (sheet in sheets) {
  # Restrict the summary to numeric columns only.
  numeric_df <- data_list[[sheet]] %>% select(where(is.numeric))
  # Section header for the printed report.
  cat("\n## Sheet:", sheet, "\n")
  if (ncol(numeric_df) == 0) {
    cat("No numeric columns to summarize in this sheet.\n")
  } else {
    # skim() reports mean, sd, percentiles, missingness, and mini-histograms.
    summary_list[[sheet]] <- skim(numeric_df)
    print(summary_list[[sheet]])
  }
}
##
## ## Sheet: users
## ── Data Summary ────────────────────────
## Values
## Name numeric_df
## Number of rows 1129
## Number of columns 15
## _______________________
## Column type frequency:
## numeric 15
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0
## 1 stem_degree_flag 0 1 0.961 0.194 0
## 2 graduation_year 0 1 2023. 1.10 2019
## 3 total_experience_years 0 1 2.83 1.51 0
## 4 relevant_experience_years 0 1 1.92 1.16 0
## 5 internship_count 0 1 2.02 1.12 0
## 6 project_count 0 1 2.49 2.06 1
## 7 leadership_experience_flag 0 1 0.389 0.488 0
## 8 resume_experience_score 0 1 83.3 13.6 41
## 9 skill_match_score 0 1 82.3 14.2 34
## 10 experience_depth_score 0 1 4.13 1.13 1
## 11 experience_consistency_flag 0 1 0.919 0.272 0
## 12 resume_gap_detected 0 1 0.123 0.329 0
## 13 resume_review_time_sec 0 1 198. 59.3 60
## 14 resume_feedback_length_tokens 0 1 342. 141. 80
## 15 overall_resume 0 1 76.7 8.85 46.8
## p25 p50 p75 p100 hist
## 1 1 1 1 1 ▁▁▁▁▇
## 2 2023 2023 2024 2025 ▁▁▂▇▇
## 3 1.83 2.58 3.42 8.5 ▃▇▃▁▁
## 4 1.17 1.75 2.42 7.83 ▇▇▂▁▁
## 5 1 2 3 5 ▇▇▅▂▁
## 6 1 2 3 12 ▇▁▁▁▁
## 7 0 0 1 1 ▇▁▁▁▅
## 8 74 85 96 99 ▁▂▃▆▇
## 9 73 84 95 99 ▁▁▃▆▇
## 10 3 5 5 5 ▁▂▂▃▇
## 11 1 1 1 1 ▁▁▁▁▇
## 12 0 0 0 1 ▇▁▁▁▁
## 13 158 198 237 405 ▃▇▇▂▁
## 14 241 336 434 766 ▅▇▇▃▁
## 15 71 78 83.8 90.5 ▁▂▅▇▇
##
## ## Sheet: sessions
## ── Data Summary ────────────────────────
## Values
## Name numeric_df
## Number of rows 1129
## Number of columns 7
## _______________________
## Column type frequency:
## numeric 7
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25 p50
## 1 overall_tech_score 0 1 72.2 16.6 29 57 77
## 2 weighted_behavior_score 0 1 75.5 7.75 48 70 75
## 3 pass_flag 0 1 0.671 0.470 0 0 1
## 4 confidence_change_overall 0 1 -0.116 4.97 -10 -4 0
## 5 latency_ms_p50 0 1 43.5 10.3 24 36 43
## 6 latency_ms_p95 0 1 113. 21.6 65 98 111
## 7 messages_exchanged 0 1 49.6 9.79 29 42 50
## p75 p100 hist
## 1 85 100 ▁▅▃▇▆
## 2 81 96 ▁▃▇▇▂
## 3 1 1 ▃▁▁▁▇
## 4 3 10 ▅▇▇▆▃
## 5 51 68 ▅▇▇▅▂
## 6 127 176 ▂▇▇▃▁
## 7 57 70 ▅▇▇▇▅
##
## ## Sheet: technical_questions
## ── Data Summary ────────────────────────
## Values
## Name numeric_df
## Number of rows 3387
## Number of columns 8
## _______________________
## Column type frequency:
## numeric 8
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25 p50 p75
## 1 order_in_session 0 1 2 0.817 1 1 2 3
## 2 time_to_answer_sec 0 1 519. 277. 70 312. 474 683
## 3 question_score 0 1 66.3 21.1 13 47 74 83
## 4 correctness_flag 0 1 0.610 0.488 0 0 1 1
## 5 followup_turns 0 1 3.01 1.44 1 2 3 4
## 6 analysis_depth 0 1 3.29 1.26 1 2 3 4
## 7 overthinking_flag 0 1 0.122 0.328 0 0 0 0
## 8 statistical_depth 2574 0.240 3.01 0.642 2 3 3 3
## p100 hist
## 1 3 ▇▁▇▁▇
## 2 1409 ▆▇▃▂▁
## 3 100 ▁▅▃▇▆
## 4 1 ▅▁▁▁▇
## 5 6 ▇▅▅▂▂
## 6 6 ▇▇▇▃▁
## 7 1 ▇▁▁▁▁
## 8 4 ▃▁▇▁▃
##
## ## Sheet: behavioral_questions
## ── Data Summary ────────────────────────
## Values
## Name numeric_df
## Number of rows 3387
## Number of columns 17
## _______________________
## Column type frequency:
## numeric 17
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd
## 1 order_in_session 0 1 2 0.817
## 2 response_time_sec 0 1 238. 42.3
## 3 response_length_tokens 0 1 197. 44.3
## 4 response_sentiment_score 0 1 0.0793 0.259
## 5 hesitation_flag 0 1 0.244 0.429
## 6 followup_depth 0 1 1.49 1.12
## 7 communication_clarity_score 0 1 3.70 0.975
## 8 teamwork_score 0 1 3.69 0.967
## 9 conflict_resolution_score 0 1 3.56 0.967
## 10 leadership_signal_score 0 1 3.42 1.02
## 11 problem_structuring_score 0 1 3.65 0.988
## 12 professionalism_score 0 1 3.82 0.945
## 13 behavioral_strength_flag 0 1 0.535 0.499
## 14 behavioral_risk_flag 0 1 0.0466 0.211
## 15 overall_behavioral_score_component 0 1 75.5 12.2
## 16 weighted_behavior_score 0 1 75.5 7.74
## 17 confidence_change_behavioral 0 1 0.243 2.69
## p0 p25 p50 p75 p100 hist
## 1 1 1 2 3 3 ▇▁▇▁▇
## 2 160 203 239 273 337 ▆▇▇▇▂
## 3 120 160 196 233 300 ▇▇▇▇▂
## 4 -0.4 -0.13 0.08 0.3 0.6 ▆▇▇▇▅
## 5 0 0 0 0 1 ▇▁▁▁▂
## 6 0 0 1 3 3 ▇▇▁▇▇
## 7 1 3 4 4 5 ▁▂▆▇▅
## 8 1 3 4 4 5 ▁▂▆▇▅
## 9 1 3 4 4 5 ▁▂▇▇▃
## 10 1 3 3 4 5 ▁▃▇▇▃
## 11 1 3 4 4 5 ▁▂▆▇▅
## 12 1 3 4 5 5 ▁▂▆▇▆
## 13 0 0 1 1 1 ▇▁▁▁▇
## 14 0 0 0 0 1 ▇▁▁▁▁
## 15 33 67 76 84 98 ▁▂▆▇▅
## 16 48 70 75 81 96 ▁▃▇▇▂
## 17 -4 -2 0 2 6 ▇▇▇▅▂
##
## ## Sheet: system_events
## No numeric columns to summarize in this sheet.
##
## ## Sheet: timeline
## No numeric columns to summarize in this sheet.
The results include mean, median, standard deviation, minimum, maximum, and missing value information for each numeric variable.
Suspicious points — sheet users: 1. The maximum of total_experience_years is 8.5, but the codebook specifies an upper limit of 5 years for the early-career focus. 2. project_count has a maximum of 12 against a mean of 2.5, which suggests outliers above the normal range. 3. The maximum of resume_feedback_length_tokens is 766; a visual outlier analysis can determine whether this is reasonable.
Sheet technical_questions 1. The percentage of statistical_depth’s missing values is extremely high. But the codebook indicates that [DS-SPECIFIC COLUMNS] (NULL for SDE track). Therefore, this absence might be reasonable due to the issue with the SDE track, and it is not a data error. 2. The highest value of time_to_answer_sec is 1409. But the codebook specifies that the range of this is from 15s to 360s.
clean_for_plot <- function(df, x, y = NULL){
  # Drop rows whose plotting column(s) contain non-finite values
  # (NA, NaN, Inf, -Inf) so ggplot layers receive clean input.
  # `x` (and optionally `y`) are column names supplied as strings.
  cols <- c(x, y)  # c() silently drops a NULL y
  for (col in cols) {
    df <- df %>% filter(is.finite(.data[[col]]))
  }
  df
}
# Data cleaning ----
# Cap values that exceed the codebook-documented upper limits.
# FIX: pmin() is the idiomatic vectorized cap and, unlike ifelse(), does not
# strip attributes from the column; unused intermediate variables removed.
data_list[["users"]] <- data_list[["users"]] %>%
  mutate(total_experience_years = pmin(total_experience_years, 5))  # codebook max: 5 years
data_list[["technical_questions"]] <- data_list[["technical_questions"]] %>%
  mutate(time_to_answer_sec = pmin(time_to_answer_sec, 360))  # codebook max: 360 s
# Re-run the numeric summaries to confirm the effect of the capping above.
summary_list_clean <- list()
for (sheet in sheets) {
  numeric_df <- data_list[[sheet]] %>% select(where(is.numeric))
  cat("\n## Sheet:", sheet, "after cleaning\n")
  if (ncol(numeric_df) == 0) {
    cat("No numeric columns to summarize in this sheet.\n")
  } else {
    summary_list_clean[[sheet]] <- skim(numeric_df)
    print(summary_list_clean[[sheet]])
  }
}
##
## ## Sheet: users after cleaning
## ── Data Summary ────────────────────────
## Values
## Name numeric_df
## Number of rows 1129
## Number of columns 15
## _______________________
## Column type frequency:
## numeric 15
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0
## 1 stem_degree_flag 0 1 0.961 0.194 0
## 2 graduation_year 0 1 2023. 1.10 2019
## 3 total_experience_years 0 1 2.71 1.23 0
## 4 relevant_experience_years 0 1 1.92 1.16 0
## 5 internship_count 0 1 2.02 1.12 0
## 6 project_count 0 1 2.49 2.06 1
## 7 leadership_experience_flag 0 1 0.389 0.488 0
## 8 resume_experience_score 0 1 83.3 13.6 41
## 9 skill_match_score 0 1 82.3 14.2 34
## 10 experience_depth_score 0 1 4.13 1.13 1
## 11 experience_consistency_flag 0 1 0.919 0.272 0
## 12 resume_gap_detected 0 1 0.123 0.329 0
## 13 resume_review_time_sec 0 1 198. 59.3 60
## 14 resume_feedback_length_tokens 0 1 342. 141. 80
## 15 overall_resume 0 1 76.7 8.85 46.8
## p25 p50 p75 p100 hist
## 1 1 1 1 1 ▁▁▁▁▇
## 2 2023 2023 2024 2025 ▁▁▂▇▇
## 3 1.83 2.58 3.42 5 ▂▆▇▅▅
## 4 1.17 1.75 2.42 7.83 ▇▇▂▁▁
## 5 1 2 3 5 ▇▇▅▂▁
## 6 1 2 3 12 ▇▁▁▁▁
## 7 0 0 1 1 ▇▁▁▁▅
## 8 74 85 96 99 ▁▂▃▆▇
## 9 73 84 95 99 ▁▁▃▆▇
## 10 3 5 5 5 ▁▂▂▃▇
## 11 1 1 1 1 ▁▁▁▁▇
## 12 0 0 0 1 ▇▁▁▁▁
## 13 158 198 237 405 ▃▇▇▂▁
## 14 241 336 434 766 ▅▇▇▃▁
## 15 71 78 83.8 90.5 ▁▂▅▇▇
##
## ## Sheet: sessions after cleaning
## ── Data Summary ────────────────────────
## Values
## Name numeric_df
## Number of rows 1129
## Number of columns 7
## _______________________
## Column type frequency:
## numeric 7
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25 p50
## 1 overall_tech_score 0 1 72.2 16.6 29 57 77
## 2 weighted_behavior_score 0 1 75.5 7.75 48 70 75
## 3 pass_flag 0 1 0.671 0.470 0 0 1
## 4 confidence_change_overall 0 1 -0.116 4.97 -10 -4 0
## 5 latency_ms_p50 0 1 43.5 10.3 24 36 43
## 6 latency_ms_p95 0 1 113. 21.6 65 98 111
## 7 messages_exchanged 0 1 49.6 9.79 29 42 50
## p75 p100 hist
## 1 85 100 ▁▅▃▇▆
## 2 81 96 ▁▃▇▇▂
## 3 1 1 ▃▁▁▁▇
## 4 3 10 ▅▇▇▆▃
## 5 51 68 ▅▇▇▅▂
## 6 127 176 ▂▇▇▃▁
## 7 57 70 ▅▇▇▇▅
##
## ## Sheet: technical_questions after cleaning
## ── Data Summary ────────────────────────
## Values
## Name numeric_df
## Number of rows 3387
## Number of columns 8
## _______________________
## Column type frequency:
## numeric 8
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
## 1 order_in_session 0 1 2 0.817 1 1 2 3 3
## 2 time_to_answer_sec 0 1 319. 77.6 70 312. 360 360 360
## 3 question_score 0 1 66.3 21.1 13 47 74 83 100
## 4 correctness_flag 0 1 0.610 0.488 0 0 1 1 1
## 5 followup_turns 0 1 3.01 1.44 1 2 3 4 6
## 6 analysis_depth 0 1 3.29 1.26 1 2 3 4 6
## 7 overthinking_flag 0 1 0.122 0.328 0 0 0 0 1
## 8 statistical_depth 2574 0.240 3.01 0.642 2 3 3 3 4
## hist
## 1 ▇▁▇▁▇
## 2 ▁▁▁▁▇
## 3 ▁▅▃▇▆
## 4 ▅▁▁▁▇
## 5 ▇▅▅▂▂
## 6 ▇▇▇▃▁
## 7 ▇▁▁▁▁
## 8 ▃▁▇▁▃
##
## ## Sheet: behavioral_questions after cleaning
## ── Data Summary ────────────────────────
## Values
## Name numeric_df
## Number of rows 3387
## Number of columns 17
## _______________________
## Column type frequency:
## numeric 17
## ________________________
## Group variables None
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd
## 1 order_in_session 0 1 2 0.817
## 2 response_time_sec 0 1 238. 42.3
## 3 response_length_tokens 0 1 197. 44.3
## 4 response_sentiment_score 0 1 0.0793 0.259
## 5 hesitation_flag 0 1 0.244 0.429
## 6 followup_depth 0 1 1.49 1.12
## 7 communication_clarity_score 0 1 3.70 0.975
## 8 teamwork_score 0 1 3.69 0.967
## 9 conflict_resolution_score 0 1 3.56 0.967
## 10 leadership_signal_score 0 1 3.42 1.02
## 11 problem_structuring_score 0 1 3.65 0.988
## 12 professionalism_score 0 1 3.82 0.945
## 13 behavioral_strength_flag 0 1 0.535 0.499
## 14 behavioral_risk_flag 0 1 0.0466 0.211
## 15 overall_behavioral_score_component 0 1 75.5 12.2
## 16 weighted_behavior_score 0 1 75.5 7.74
## 17 confidence_change_behavioral 0 1 0.243 2.69
## p0 p25 p50 p75 p100 hist
## 1 1 1 2 3 3 ▇▁▇▁▇
## 2 160 203 239 273 337 ▆▇▇▇▂
## 3 120 160 196 233 300 ▇▇▇▇▂
## 4 -0.4 -0.13 0.08 0.3 0.6 ▆▇▇▇▅
## 5 0 0 0 0 1 ▇▁▁▁▂
## 6 0 0 1 3 3 ▇▇▁▇▇
## 7 1 3 4 4 5 ▁▂▆▇▅
## 8 1 3 4 4 5 ▁▂▆▇▅
## 9 1 3 4 4 5 ▁▂▇▇▃
## 10 1 3 3 4 5 ▁▃▇▇▃
## 11 1 3 4 4 5 ▁▂▆▇▅
## 12 1 3 4 5 5 ▁▂▆▇▆
## 13 0 0 1 1 1 ▇▁▁▁▇
## 14 0 0 0 0 1 ▇▁▁▁▁
## 15 33 67 76 84 98 ▁▂▆▇▅
## 16 48 70 75 81 96 ▁▃▇▇▂
## 17 -4 -2 0 2 6 ▇▇▇▅▂
##
## ## Sheet: system_events after cleaning
## No numeric columns to summarize in this sheet.
##
## ## Sheet: timeline after cleaning
## No numeric columns to summarize in this sheet.
Create Image: Create histograms for each numeric column in each sheet
# Histograms for each numeric column in each sheet.
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  if (ncol(numeric_df) > 0) {
    for (col_name in names(numeric_df)) {
      df2 <- clean_for_plot(df, col_name)  # Drop non-finite values before plotting
      # BUG FIX: a fixed binwidth of 1 collapses any variable whose entire
      # range is below 1 (e.g. response_sentiment_score in [-0.4, 0.6]) into
      # a single bar. A fixed bin COUNT adapts to each variable's range.
      p_hist <- ggplot(df2, aes(x = .data[[col_name]])) +
        geom_histogram(bins = 30, fill = "blue", color = "black") +
        labs(title = paste("Histogram of", col_name, "in sheet", sheet),
             x = col_name, y = "Count")
      print(p_hist)
    }
  }
}
# Boxplots for each numeric column in each sheet (no grouping).
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  if (ncol(numeric_df) > 0) {
    for (col_name in names(numeric_df)) {
      plot_df <- clean_for_plot(df, col_name)  # Remove non-finite values first
      box_plot <- ggplot(plot_df, aes(x = factor(1), y = .data[[col_name]])) +
        geom_boxplot(outlier.colour = "purple", fill = "blue", color = "black") +
        # Overlay the median (red) and mean (green) as reference points.
        stat_summary(fun = median, geom = "point", shape = 20, size = 3, color = "red") +
        stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "green") +
        labs(title = paste("Boxplot of", col_name, "in sheet", sheet),
             x = "", y = col_name)
      print(box_plot)
    }
  }
}
# Boxplots for each numeric column grouped by "track", when that column exists.
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  # FIX: use the scalar, short-circuiting && inside a scalar if() condition;
  # the elementwise & works here only by accident and errors on length > 1
  # inputs since R 4.3.
  if ("track" %in% names(df) && ncol(numeric_df) > 0) {
    df$track <- as.factor(df$track)  # Treat track as a categorical variable
    for (col_name in names(numeric_df)) {
      df2 <- clean_for_plot(df, col_name)          # Drop non-finite values
      df2 <- df2 %>% filter(!is.na(track))         # Drop rows missing track
      p_box_track <- ggplot(df2, aes(x = track, y = .data[[col_name]])) +
        geom_boxplot(aes(fill = track), outlier.colour = "purple") +
        # Overlay the median (red) and mean (green) per group.
        stat_summary(fun = median, geom = "point", shape = 20, size = 3, color = "red") +
        stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "green") +
        labs(title = paste("Boxplot of", col_name, "by track in sheet", sheet),
             x = "Track", y = col_name)
      print(p_box_track)
    }
  }
}
# Boxplots for each numeric column grouped by each major group column present.
major_groups <- c("track", "device_type", "timezone")  # Grouping variables of interest
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  if (ncol(numeric_df) > 0) {  # Only plot when numeric columns exist
    for (col_name in names(numeric_df)) {
      # PERF: the non-finite-value cleaning depends only on col_name, so it
      # is hoisted out of the inner group loop instead of re-running once
      # per grouping variable.
      df_clean <- clean_for_plot(df, col_name)
      for (group in major_groups) {
        if (group %in% names(df_clean)) {
          df2 <- df_clean %>% filter(!is.na(.data[[group]]))  # Drop missing group values
          p_box_group <- ggplot(df2, aes(x = .data[[group]], y = .data[[col_name]])) +
            geom_boxplot(aes(fill = .data[[group]]), outlier.colour = "purple") +
            # Overlay the median (red) and mean (green) per group.
            stat_summary(fun = median, geom = "point", shape = 20, size = 3, color = "red") +
            stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "green") +
            labs(title = paste("Boxplot of", col_name, "by", group, "in sheet", sheet),
                 x = group, y = col_name)
          print(p_box_group)
        }
      }
    }
  }
}
# Scatter plots for every pair of numeric columns, colored by "track" when
# that column has more than one level; Pearson correlation shown in the title.
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  if (ncol(numeric_df) > 1) {  # Pairs require at least two numeric columns
    numeric_cols <- names(numeric_df)
    for (i in 1:(length(numeric_cols) - 1)) {
      for (j in (i + 1):length(numeric_cols)) {
        x <- numeric_cols[i]
        y <- numeric_cols[j]
        df2 <- clean_for_plot(df, x, y)  # Drop non-finite values in either column
        cor_val <- cor(df2[[x]], df2[[y]], use = "complete.obs")
        if ("track" %in% names(df2)) {
          df2 <- df2 %>% filter(!is.na(track))
          df2$track <- as.factor(df2$track)
          if (length(levels(df2$track)) > 1) {
            p_scatter <- ggplot(df2, aes(x = .data[[x]], y = .data[[y]])) +
              geom_point(aes(color = track)) +
              geom_smooth(method = "lm", se = FALSE) +  # Linear trend line
              labs(color = "Track",
                   title = paste("Scatter plot of", x, "vs", y, "in sheet", sheet,
                                 "\nCorrelation:", round(cor_val, 2)),
                   x = x, y = y)
          } else {
            # BUG FIX: this fallback previously carried copy-pasted
            # "device_type" legend and title text from the device_type loop.
            # This is the track loop, and the fallback plot has no color
            # aesthetic, so it gets a plain title and no color legend.
            p_scatter <- ggplot(df2, aes(x = .data[[x]], y = .data[[y]])) +
              geom_point() +
              geom_smooth(method = "lm", se = FALSE) +
              labs(title = paste("Scatter plot of", x, "vs", y, "in sheet", sheet,
                                 "\nCorrelation:", round(cor_val, 2)),
                   x = x, y = y)
          }
          print(p_scatter)
        }
      }
    }
  }
}
# Scatter plots for every numeric pair, colored by device_type.
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  if (ncol(numeric_df) > 1) {
    numeric_cols <- names(numeric_df)
    n_numeric <- length(numeric_cols)
    for (i in 1:(n_numeric - 1)) {
      for (j in (i + 1):n_numeric) {
        x <- numeric_cols[i]
        y <- numeric_cols[j]
        pair_df <- clean_for_plot(df, x, y)  # Drop non-finite values in either column
        if ("device_type" %in% names(pair_df)) {
          pair_df <- pair_df %>% filter(!is.na(device_type))
          pair_df$device_type <- as.factor(pair_df$device_type)
          # Only plot when the grouping distinguishes more than one level.
          if (length(levels(pair_df$device_type)) > 1) {
            cor_val <- cor(pair_df[[x]], pair_df[[y]], use = "complete.obs")
            scatter_plot <- ggplot(pair_df, aes(x = .data[[x]], y = .data[[y]])) +
              geom_point(aes(color = device_type)) +
              geom_smooth(method = "lm", se = FALSE) +  # Linear trend line
              labs(color = "Device Type",
                   title = paste("Scatter plot of", x, "vs", y, "by device_type in sheet", sheet,
                                 "\nCorrelation:", round(cor_val, 2)),
                   x = x, y = y)
            print(scatter_plot)
          }
        }
      }
    }
  }
}
# Scatter plots for every numeric pair, colored by timezone.
for (sheet in sheets) {
  df <- data_list[[sheet]]
  numeric_df <- df %>% select(where(is.numeric))
  if (ncol(numeric_df) > 1) {
    numeric_cols <- names(numeric_df)
    for (i in 1:(length(numeric_cols) - 1)) {
      for (j in (i + 1):length(numeric_cols)) {
        x <- numeric_cols[i]
        y <- numeric_cols[j]
        df2 <- clean_for_plot(df, x, y)  # Drop non-finite values in either column
        if ("timezone" %in% names(df2)) {
          # BUG FIX: the original filtered on !is.na("timezone") — a string
          # literal, which is never NA — so rows with a missing timezone were
          # silently kept. Filter on the column itself.
          df2 <- df2 %>% filter(!is.na(timezone))
          df2$timezone <- as.factor(df2$timezone)
          if (length(levels(df2$timezone)) > 1) {
            cor_val <- cor(df2[[x]], df2[[y]], use = "complete.obs")
            p_scatter <- ggplot(df2, aes(x = .data[[x]], y = .data[[y]])) +
              geom_point(aes(color = timezone)) +
              geom_smooth(method = "lm", se = FALSE) +  # Linear trend line
              labs(color = "Timezone",
                   title = paste("Scatter plot of", x, "vs", y, "by timezone in sheet", sheet,
                                 "\nCorrelation:", round(cor_val, 2)),
                   x = x, y = y)
            print(p_scatter)
          }
        }
      }
    }
  }
}